Objective

  • Understand crime in WDC by exploring data at both macro and micro levels, make recommendations for 2018, and then make future predictions.

Goals

  1. Find total offenses by each factor group
    • Historically
    • Latest Year 2017
  2. Find total offenses by each offense type/ward group
    • Historically
    • Latest Year 2017
  3. Trend over time (last 3 years)
    • total offenses
    • total offenses by type
  4. Predict total offenses for the next 3 months

Get Data

a = read_csv('raw.csv') %>% 
  clean_names() %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    report_dat,
    month, day, year, hour, minute, second,
    where(is.Date),
    where(is.character),
    where(is.factor),
    where(is.numeric)
  ) %>% slice_sample(prop = 1) #work with 5% of dataset for now

Resources

  1. WDC Wards
  2. psa = police service area

sample data

a %>% sample_n(10)

glimpse structure

a %>% glimpse
## Rows: 342,867
## Columns: 30
## $ report_dat           <chr> "5/8/2012 8:38:00 PM", "11/24/2009 10:23:00 PM...
## $ month                <dbl> 5, 11, 7, 4, 11, 2, 2, 9, 1, 10, 1, 10, 1, 6, ...
## $ day                  <dbl> 8, 24, 15, 26, 29, 24, 27, 20, 5, 17, 9, 22, 3...
## $ year                 <dbl> 2012, 2009, 2014, 2011, 2011, 2017, 2017, 2015...
## $ hour                 <dbl> 20, 22, 21, 11, 16, 2, 14, 21, 11, 4, 12, 3, 1...
## $ minute               <dbl> 38, 23, 5, 30, 10, 38, 39, 30, 55, 53, 40, 4, ...
## $ second               <dbl> 0, 0, 0, 0, 0, 47, 42, 14, 0, 14, 21, 28, 0, 0...
## $ anc                  <chr> "7C", "6C", "6E", "7C", "6C", "2C", "3G", "1B"...
## $ block                <chr> "5400 - 5499 BLOCK OF DIX STREET NE", "700 - 7...
## $ block_group          <chr> "007808 1", "008302 1", "004802 1", "007808 3"...
## $ crimetype            <chr> "Non-Violent", "Non-Violent", "Non-Violent", "...
## $ end_date             <chr> "5/8/2012 9:00:00 PM", "11/24/2009 9:00:00 PM"...
## $ ew                   <chr> "East", "East", "East", "East", "East", "East"...
## $ method               <chr> "OTHERS", "OTHERS", "OTHERS", "OTHERS", "OTHER...
## $ neighborhood_cluster <chr> "Cluster 31", "Cluster 25", "Cluster 8", "Clus...
## $ ns                   <chr> "North", "North", "North", "North", "North", "...
## $ offense              <chr> "BURGLARY", "THEFT F/AUTO", "THEFT/OTHER", "TH...
## $ quad                 <chr> "Northeast", "Northeast", "Northeast", "Northe...
## $ shift                <chr> "EVENING", "EVENING", "EVENING", "DAY", "EVENI...
## $ start_date           <chr> "5/8/2012 12:45:00 PM", "11/24/2009 5:00:00 PM...
## $ voting_precinct      <chr> "Precinct 96", "Precinct 83", "Precinct 18", "...
## $ ccn                  <dbl> 12063047, 9168589, 14402225, 11056847, 1117511...
## $ census_tract         <dbl> 7808, 8302, 4802, 7808, 10600, 5900, 1500, 430...
## $ district             <dbl> 6, 1, 3, 6, 5, 1, 2, 3, 1, 2, 7, 1, 2, 3, 3, 4...
## $ psa                  <dbl> 608, 104, 308, 602, 506, 102, 201, 305, 108, 2...
## $ ward                 <dbl> 7, 6, 6, 7, 6, 2, 4, 1, 6, 3, 8, 2, 2, 2, 6, 1...
## $ x                    <dbl> 242612, 161081, 178473, 217163, 167233, 35440,...
## $ x1                   <dbl> 242612, 161081, 178473, 217163, 167233, 35440,...
## $ xblock               <dbl> -76.92233, -76.99557, -77.01756, -76.92188, -7...
## $ yblock               <dbl> 38.89472, 38.90020, 38.90564, 38.89463, 38.905...

counts of unique levels

a %>% map_df(n_unique)

clean data

#remove unused cols
a = a %>% select(!c(minute, second, anc, block, block_group, end_date, ew, neighborhood_cluster, ns, start_date, voting_precinct, ccn, district, x, x1)) %>%
  mutate(
    report_dat = anytime::anydate(report_dat),
    #start_date = anytime::anydate(start_date),
    #end_date = anytime::anydate(end_date),
    across(where(is.character), factor),
    census_tract = factor(census_tract, levels = a$census_tract %>% unique %>% sort),
    ward = factor(ward, levels = a$ward %>% unique %>% sort),
    psa = factor(psa, levels = a$psa %>% unique %>% sort),
    year = factor(year, levels = a$year %>% unique %>% sort),
    month = factor(month, levels = a$month %>% unique %>% sort),
    day = factor(day, levels = a$day %>% unique %>% sort),
    hour = factor(hour, levels = a$hour %>% unique %>% sort)
    ) %>%
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    month, day, year, hour,
    where(is.character),
    where(is.factor),
    where(is.numeric) 
    ) %>% arrange(report_dat, crimetype, offense)
    
abak = a
#saveRDS(abak, 'cleaned_data_10yrs.RDS')
#a = abak

If we wanted to do more detailed geographic analysis, we might want to include block and block group

EDA: factor vars

sample data

a %>% select(where(is.factor)) %>% sample_n(10)

glimpse structure

a %>% select(where(is.factor)) %>% glimpse
## Rows: 10,594
## Columns: 12
## $ month        <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ day          <fct> 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, ...
## $ year         <fct> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, ...
## $ hour         <fct> 3, 13, 8, 21, 16, 20, 20, 7, 11, 3, 8, 13, 8, 21, 17, ...
## $ census_tract <fct> 7809, 9204, 10600, 7603, 3301, 10200, 7803, 9810, 1090...
## $ crimetype    <fct> Non-Violent, Non-Violent, Non-Violent, Non-Violent, No...
## $ method       <fct> OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS...
## $ offense      <fct> BURGLARY, MOTOR VEHICLE THEFT, THEFT F/AUTO, MOTOR VEH...
## $ psa          <fct> 608, 502, 501, 606, 501, 105, 602, 708, 708, 403, 107,...
## $ quad         <fct> Northeast, Northeast, Northeast, Southeast, Northeast,...
## $ shift        <fct> MIDNIGHT, DAY, DAY, EVENING, EVENING, EVENING, EVENING...
## $ ward         <fct> 7, 5, 6, 7, 5, 6, 7, 8, 8, 4, 6, 8, 5, 1, 1, 6, 6, 1, ...

check missing values

a %>% select(where(is.factor)) %>% miss_var_summary

With so little data missing, I feel comfortable omitting rows with NAs

a = a %>% na.omit()

distribution of level counts per factor

#defining custom color palette
jpal = grDevices::colorRampPalette(brewer.pal(8,'Dark2'))(25)

a %>% select(where(is.factor)) %>% map_df(n_unique)
a %>% select(where(is.factor)) %>%
  map_df(n_unique) %>%
  pivot_longer(. , everything(), 'features') %>%
  plot_ly(x = ~features, y = ~value, color = ~features, colors = jpal) %>% 
  add_bars() %>%
  layout(title = 'Unique Level Counts per Factor')

reference: names of unique levels

a %>% select(where(is.factor)) %>% map(unique)
## $month
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12
## 
## $day
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
## 
## $year
## [1] 2012 2013 2014 2015 2016 2017
## Levels: 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
## 
## $hour
##  [1] 3  13 8  21 16 20 7  11 17 14 9  10 4  18 12 15 2  0  19 6  22 1  23 5 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## 
## $census_tract
##   [1] 7809  9204  10600 7603  3301  10200 7803  9810  10900 2002  8200  7504 
##  [13] 8803  2802  3000  8410  3200  5201  2702  9604  9505  9504  11100 9102 
##  [25] 4002  9700  4802  6600  4400  11000 4600  9803  502   600   8701  9507 
##  [37] 6202  3700  5800  2701  7502  5002  9503  9903  9811  2202  10400 4801 
##  [49] 4100  8804  8904  9603  8903  9203  3100  1702  5600  9509  10100 4902 
##  [61] 7304  1001  7709  202   8100  8402  5500  1901  2900  8302  9601  6801 
##  [73] 2101  9400  9301  7409  7708  4702  3400  6400  7707  400   5001  7401 
##  [85] 4202  10700 2502  6500  1600  2102  4300  7403  7200  9602  100   7404 
##  [97] 3800  3600  9802  8001  9904  5900  7703  2600  3302  8802  7100  9901 
## [109] 2801  300   501   7901  7406  2501  7503  8301  1100  9907  10300 7407 
## [121] 1002  4901  9807  7605  8002  9508  9501  7408  5301  1402  4001  901  
## [133] 3500  8702  7000  2201  10800 2400  7604  1804  10500 9000  9302  1302 
## [145] 7808  1803  9905  1200  7601  9804  2001  9201  7804  4201  9906  9902 
## [157] 9801  4701  7806  702   3900  802   701   7807  6802  6900  2302  6804 
## [169] 2301  7903  201   1401  801   1301  6700  1902  7301  1500  902  
## 179 Levels: 100 201 202 300 400 501 502 600 701 702 801 802 901 902 ... 11100
## 
## $crimetype
## [1] Non-Violent Violent    
## Levels: Non-Violent Violent
## 
## $method
## [1] OTHERS GUN    KNIFE 
## Levels: GUN KNIFE OTHERS
## 
## $offense
## [1] BURGLARY                   MOTOR VEHICLE THEFT       
## [3] THEFT F/AUTO               THEFT/OTHER               
## [5] ASSAULT W/DANGEROUS WEAPON ROBBERY                   
## [7] SEX ABUSE                  ARSON                     
## [9] HOMICIDE                  
## 9 Levels: ARSON ASSAULT W/DANGEROUS WEAPON BURGLARY ... THEFT/OTHER
## 
## $psa
##  [1] 608 502 501 606 105 602 708 403 107 701 506 302 104 409 307 408 603 406 504
## [20] 503 303 706 308 305 707 204 404 102 304 101 702 405 103 505 306 208 507 401
## [39] 207 705 202 605 206 402 601 108 704 703 301 203 106 604 201 407 607 205
## 56 Levels: 101 102 103 104 105 106 107 108 201 202 203 204 205 206 207 ... 708
## 
## $quad
## [1] Northeast Southeast Northwest
## Levels: Northeast Northwest Southeast
## 
## $shift
## [1] MIDNIGHT DAY      EVENING 
## Levels: DAY EVENING MIDNIGHT
## 
## $ward
## [1] 7 5 6 8 4 1 2 3
## Levels: 1 2 3 4 5 6 7 8

Goal 1. Find total offenses by each factor group X

Historically (2012 - 2016)

a %>% filter(year != 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - Historic')
## 2 columns ignored with more than 50 categories.
## census_tract: 179 categories
## psa: 56 categories

a %>% count(psa, sort = TRUE, name = 'count') %>%
  head(10) %>%
  mutate(psa = factor(psa)) %>%
  mutate(psa = fct_reorder(psa, -count)) %>% 
  plot_ly(x = ~psa, y = ~count, color = ~psa, colors = jpal) %>% 
  add_bars() %>% 
  layout(
    title = 'Police Service Areas with the Most Crime'
  )

Observations:

  1. Most offenses occurs late summer to early fall (top 3 months: Oct, July, Aug)
  2. More offenses occurs towards the latter half the month (>= Day 16)
  3. More offenses occurs towards the end of the month (top 3 months: July, August, June)
  4. Non-violent offenses occur ~4.5 x as much as violent offenses
  5. Theft is, by far, the most common offense
    • auto-theft is so predominant, it has its own category
  6. The Northeast quad is significantly more dangerous than the other wards
    • OR… we collect data more accurately/frequently for this quad?
  7. Ward 2 is the most dangerous ward by a good margin
    • OR… we collect data more accurately/frequently for this ward?
  8. PSA service areas 207, 208, and 302 have the most crime

Latest Year 2017

a %>% filter(year == 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - 2017')
## 2 columns ignored with more than 50 categories.
## census_tract: 174 categories
## psa: 56 categories

a %>% filter(year == 2017) %>% count(psa, sort = TRUE, name = 'count') %>%
  head(10) %>%
  mutate(psa = factor(psa)) %>%
  mutate(psa = fct_reorder(psa, -count)) %>% 
  plot_ly(x = ~psa, y = ~count, color = ~psa, colors = jpal) %>% 
  add_bars() %>% 
  layout(
    title = 'Police Service Areas with the Most Crime'
  )

quick map

ipal = grDevices::colorRampPalette(brewer.pal(12,'Paired'))(56)

a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~quad, colors = ipal) %>% add_markers() %>% layout(title = 'WDC Quadrants')
a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~ward, colors = ipal) %>% add_markers() %>% layout(title = 'WDC Wards')
a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~psa, colors = ipal) %>% add_markers() %>% layout(title = 'WDC PSAs') %>% hide_legend()

Observations relative to Historic

  1. January 2017 was atypically more dangerous
  2. Non-violent offenses make up a greater percent of total offenses, occurring about ~6 x as much as violent offenses
  3. Wards 5 and 7 are more dangerous than they were historically; 6 is safer
  4. PSA 208 continues to be a dangerous ward

Recommendations 1

Recommendations for the year 2018 based on 2017 findings

  1. Staff More Policemen and women from 2 to 4 pm..
  2. Staff More Policemen and women in ward 2.
  3. Staff More Policemen and women in PSA 208.

Goal 2. Find total offenses by each offense type/ward group

Historically (2012 - 2016)

ggplotly(
  a %>% filter(year != 2017) %>%
    count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
    geom_col() +
    coord_flip() +
    labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - Historic') +
    facet_wrap(~ward) +
    theme(legend.position = 'none')
)

Observations:

  1. Ward 2 has the greatest number of cases as observed above, but this is disproportionately mostly composed of theft/other offenses
    • the offense count of its second largest offense category, theft/auto, is even slightly less than that of Ward 1’s’
  2. Ward 7 has a relatively high proportion of motor vehicle theft offenses
  3. Ward 8 has a relatively high proportion of burglary offenses

Latest Year 2017

ggplotly(
  a %>% filter(year == 2017) %>% 
    count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
    geom_col() +
    coord_flip() +
    labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - 2017') +
    facet_wrap(~ward) +
    theme(legend.position = 'none')
)

Observations relative to Historic

  1. Ward 2’s share of theft f/auto is higher
  2. Relative to other wards, Wards 5,6,7 are more dangerous than they were historically
    • on an absolute basis, all wards are less dangerous
      • see time series graphs below

Recommendations 2

Recommendations for the year 2018 based on 2017 findings

  1. Work on educating citizens on proper theft deterrent methods, especially in ward 2; perhaps install more cameras there.
  2. Looking at wards 7 and 8, it seems like motor vehicle theft may be correlated by assault with a dangerous weapon. + Check out if both offenses occurred simultaneously.

Goal 3. Trend over time

# create col for start of month (a 'month' col) used for grouping and graphing
a = a %>% mutate(
  monthkey = lubridate::make_datetime(
    as.numeric(as.character(year)),
    as.numeric(as.character(month)),
    1)
  ) %>% relocate(report_dat, monthkey, everything())

#Total Offenses over time

Total Offenses by Month

a %>% group_by(monthkey) %>%
  summarise(total.offenses = n()) %>%
  ungroup() %>% 
  plot_ly(x = ~monthkey, y =~total.offenses) %>%
  add_lines(size = I(3)) %>% layout(
  xaxis = list(title = ''),
  yaxis = list(title = ''),
  title = 'Total Offenses by Month'
  )

Observations

  1. Crime follows clear seasonality
    • peaks occurring late summer/early fall
  2. Peaked in mid 2014
  3. Troughed in early 2015

Total Offenses by Month/Type

ggplotly(
  a %>% group_by(monthkey, offense)%>%
    summarise(total.offenses = n()) %>%
    ungroup() %>%
    mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses, fill = offense)) +
    geom_area() +
    labs(title = 'Total Offenses Percentage (#) by Month/Offense Type', x = '', y = '')
)
ggplotly(
  a %>% group_by(monthkey, offense)%>%
    summarise(total.offenses = n()) %>%
    mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>% 
    ungroup() %>% 
    mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses.pct, fill = offense)) +
    geom_area() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = 'Total Offenses Percentage (%) by Month/Offense Type', x = '', y = '')
)

Total Offenses by Month/Ward

ggplotly(
  a %>% group_by(monthkey, ward)%>%
    summarise(total.offenses = n()) %>%
    ungroup() %>%
    mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses, fill = ward)) +
    geom_area() +
    labs(title = 'Total Offenses Percentage (#) by Month/Ward', x = '', y = '')
)
ggplotly(
  a %>% group_by(monthkey, ward)%>%
    summarise(total.offenses = n()) %>%
    mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>% 
    ungroup() %>% 
    mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses.pct, fill = ward)) +
    geom_area() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = 'Total Offenses Percentage (%) by Month/Ward', x = '', y = '')
) 

EDA: num vars

sample data

a %>% select(where(is.numeric)) %>% sample_n(10)

glimpse structure

a %>% select(where(is.numeric)) %>% glimpse
## Rows: 10,565
## Columns: 2
## $ xblock <dbl> -76.92713, -76.99716, -77.00740, -76.94992, -77.01060, -77.0...
## $ yblock <dbl> 38.90078, 38.92093, 38.90645, 38.86128, 38.92023, 38.87786, ...

check missing values

a %>% select(where(is.numeric)) %>% miss_var_summary

Anomaly Detection

Total Offenses by Month

library(anomalize)
## == Use anomalize to improve yo
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)

# alpha: Controls the width of the "normal" range. Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.

# max_anoms: The maximum percent of anomalies permitted to be identified.

a.anomalize = a %>%
  group_by(monthkey) %>% 
  summarise(total.offenses = n()) %>%
  time_decompose(total.offenses, method = 'twitter', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.30, method = 'gesd') %>%
  time_recompose()
## `summarise()` ungrouping output (override with `.groups` argument)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = monthkey
## frequency = 12 months
## median_span = 35.5 months
ggplotly(
a.anomalize %>%
  filter(monthkey < as.Date('2017-11-01')) %>% 
  plot_anomalies(
    ncol = 1,
    alpha_dots = 0.5,
    alpha_circles = 0.5,
    size_circles = 1.5,
    time_recomposed = TRUE,
    alpha_ribbon = 0.05
    ) + scale_y_continuous(labels = comma) +
  labs(x = '', y = 'total.offenses', title = 'total.offenses')
)

There was statistically high crime activity in November 2014 There was statistically low crime activity in August & September 2017

Predict Total Offenses for the 3 Months

Create Model

library(prophet)

#renaming cols to prophet's col conventions
a.agg = a %>%
  #filter(crimetype == 'Violent') %>% 
  group_by(report_dat = round_date(report_dat,'month')) %>% 
  summarise(total.offenses = n()) %>% 
  select(ds = report_dat, y = total.offenses)

#creating model
a.agg.model = a.agg %>% prophet()

#using model make future period df
a.agg.future = a.agg.model %>% make_future_dataframe(
  periods = 3, #next 2 months
  freq = 'month') 

#make forecasts df
a.agg.forecast = a.agg.model %>% predict(a.agg.future)

#plot forecast
a.agg.model %>% dyplot.prophet(a.agg.forecast)
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#plot forecast components
a.agg.model %>% prophet_plot_components(a.agg.forecast)

Looking at bottom chart of the component graphs, we can see how early June, late September, and early December are relatively dangerous

Prediction Summary

a.agg.forecast %>% tail(3) %>% select(ds, yhat) %>% 
  rename(month = ds, total_offenses_prediction = yhat)